Music Genre Classification EDA

Authors

Maan Al Neami,
Nourah Almutairi,
Lina Alhuri,
Asma AlQahtani,
Yousef Alotaibi



1-Data Pre-processing

Code
# Importing libraries 

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import *
import scipy.stats as stats
from sklearn.metrics import *
import sklearn
from pprint import pprint
import autosklearn
import autosklearn.classification
import inspect
from autosklearn.pipeline.components.classification import ClassifierChoice
from sklearn.preprocessing import LabelEncoder
import PipelineProfiler
Code
# Importing data
# Train/test CSV splits are expected one level up from the notebook, in ../data.
df = pd.read_csv("../data/train.csv")
test = pd.read_csv('../data/test.csv')

# Peek at the first rows as a sanity check of the load.
df.head()
Popularity danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo duration_in min/ms time_signature Class
count 17568.000000 17996.000000 17996.000000 15982.000000 17996.000000 17996.000000 17996.000000 17996.000000 13619.000000 17996.000000 17996.000000 17996.000000 1.799600e+04 17996.000000 17996.000000
mean 44.512124 0.543433 0.662777 5.952447 -7.910660 0.636753 0.079707 0.247082 0.177562 0.196170 0.486208 122.623294 2.007445e+05 3.924039 6.695821
std 17.426928 0.166268 0.235373 3.196854 4.049151 0.480949 0.083576 0.310632 0.304048 0.159212 0.240195 29.571527 1.119891e+05 0.361618 3.206073
min 1.000000 0.059600 0.000020 1.000000 -39.952000 0.000000 0.022500 0.000000 0.000001 0.011900 0.018300 30.557000 5.016500e-01 1.000000 0.000000
25% 33.000000 0.432000 0.509000 3.000000 -9.538000 0.000000 0.034800 0.004300 0.000089 0.097500 0.297000 99.620750 1.663370e+05 4.000000 5.000000
50% 44.000000 0.545000 0.700000 6.000000 -7.016000 1.000000 0.047400 0.081400 0.003910 0.129000 0.481000 120.065500 2.091600e+05 4.000000 8.000000
75% 56.000000 0.659000 0.860000 9.000000 -5.189000 1.000000 0.083000 0.434000 0.200000 0.258000 0.672000 141.969250 2.524900e+05 4.000000 10.000000
max 100.000000 0.989000 1.000000 11.000000 1.355000 1.000000 0.955000 0.996000 0.996000 1.000000 0.986000 217.416000 1.477187e+06 5.000000 10.000000
Code
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17996 entries, 0 to 17995
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Artist Name         17996 non-null  object 
 1   Track Name          17996 non-null  object 
 2   Popularity          17568 non-null  float64
 3   danceability        17996 non-null  float64
 4   energy              17996 non-null  float64
 5   key                 15982 non-null  float64
 6   loudness            17996 non-null  float64
 7   mode                17996 non-null  int64  
 8   speechiness         17996 non-null  float64
 9   acousticness        17996 non-null  float64
 10  instrumentalness    13619 non-null  float64
 11  liveness            17996 non-null  float64
 12  valence             17996 non-null  float64
 13  tempo               17996 non-null  float64
 14  duration_in min/ms  17996 non-null  float64
 15  time_signature      17996 non-null  int64  
 16  Class               17996 non-null  int64  
dtypes: float64(12), int64(3), object(2)
memory usage: 2.3+ MB
Code
df.describe()

Change column values

Code
# Show the rows whose duration value is below 30 — these were recorded
# in minutes rather than milliseconds and will need rescaling.
# Use a single .loc[rows, column] selection instead of chained indexing
# (df.loc[...][col]), which builds an intermediate frame and is the
# pattern pandas warns about (SettingWithCopy) when later reused for
# assignment.
df.loc[df['duration_in min/ms'] < 30, 'duration_in min/ms']
7        3.105783
10       4.330450
13       4.440250
25       4.015633
34       3.503783
           ...   
17952    5.407783
17959    3.686017
17974    3.408667
17986    4.392883
17988    3.787783
Name: duration_in min/ms, Length: 2580, dtype: float64
Code
# Durations below 30 are stored in minutes; convert those rows to
# milliseconds (1 minute = 60,000 ms) so the whole column uses one unit.
minutes_mask = df['duration_in min/ms'] < 30

df.loc[minutes_mask, 'duration_in min/ms'] = df.loc[minutes_mask, 'duration_in min/ms'] * 60000
Code
# Rename the mixed-unit column now that every value is in milliseconds.
df.rename(columns={"duration_in min/ms": "duration_in_ms"}, inplace=True)
Code
#df.drop('index', axis=1, inplace=True)

Checking for duplicates

Code
df.duplicated().sum()
3
Code
# Drop the 3 exact duplicate rows found above and confirm the new shape.
df = df.drop_duplicates()
df.shape
(17993, 17)

Columns with NaN values

  • Popularity
  • key
  • instrumentalness
Code
# KDE plot of every numeric column, to judge each distribution's shape
# before choosing an imputation strategy (mean vs. median).
df_dist = df.select_dtypes([int, float])
fig = plt.figure(figsize=(15, 18))
try:
    for i, column in enumerate(df_dist.columns, 1):
        ax = plt.subplot(5, 3, i)  # 5x3 grid fits the 15 numeric columns
        sns.kdeplot(x=df_dist[column])
        ax.set_xlabel(None)
        ax.set_title(f'{column}')
        plt.tight_layout(w_pad=3)
except ValueError:
    # BUG FIX: the original handler was `exit;`, which only *references*
    # the exit callable — a no-op that never terminated anything. The
    # intended behaviour (ignore overflow past the subplot grid) is now
    # explicit.
    pass
plt.show()

Imputing NaN values

  • Popularity and Key are normally distributed, so we’ll use the mean to fill NA values.
  • instrumentalness seems to be positively skewed, so the median would be a good option to fill NA values
Code
# Impute missing values based on the distributions plotted above:
#   Popularity, key  -> roughly symmetric, fill with the mean
#   instrumentalness -> positively skewed, fill with the median
# Assign the result back to the column instead of calling
# fillna(..., inplace=True) on a column view: that pattern is chained
# assignment, raises FutureWarning in recent pandas, and silently fails
# to update the parent frame under copy-on-write (pandas 3.0 default).
df['Popularity'] = df['Popularity'].fillna(df['Popularity'].mean())
df['key'] = df['key'].fillna(df['key'].mean())
df['instrumentalness'] = df['instrumentalness'].fillna(df['instrumentalness'].median())
Code
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 17993 entries, 0 to 17995
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Artist Name       17993 non-null  object 
 1   Track Name        17993 non-null  object 
 2   Popularity        17993 non-null  float64
 3   danceability      17993 non-null  float64
 4   energy            17993 non-null  float64
 5   key               17993 non-null  float64
 6   loudness          17993 non-null  float64
 7   mode              17993 non-null  int64  
 8   speechiness       17993 non-null  float64
 9   acousticness      17993 non-null  float64
 10  instrumentalness  17993 non-null  float64
 11  liveness          17993 non-null  float64
 12  valence           17993 non-null  float64
 13  tempo             17993 non-null  float64
 14  duration_in_ms    17993 non-null  float64
 15  time_signature    17993 non-null  int64  
 16  Class             17993 non-null  int64  
dtypes: float64(12), int64(3), object(2)
memory usage: 3.0+ MB
Code
df.describe()
Popularity danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo duration_in_ms time_signature Class
count 17993.000000 17993.000000 17993.000000 17993.000000 17993.000000 17993.000000 17993.000000 17993.00000 17993.000000 17993.000000 17993.000000 17993.000000 1.799300e+04 17993.000000 17993.000000
mean 44.508511 0.543444 0.662765 5.952625 -7.910629 0.636692 0.079706 0.24708 0.135348 0.196119 0.486172 122.622402 2.358326e+05 3.924026 6.695326
std 17.216948 0.166279 0.235390 3.012822 4.049361 0.480966 0.083580 0.31065 0.274809 0.159170 0.240190 29.572273 8.567863e+04 0.361647 3.206105
min 1.000000 0.059600 0.000020 1.000000 -39.952000 0.000000 0.022500 0.00000 0.000001 0.011900 0.018300 30.557000 2.332000e+04 1.000000 0.000000
25% 33.000000 0.432000 0.509000 4.000000 -9.538000 0.000000 0.034800 0.00430 0.000294 0.097500 0.297000 99.622000 1.876670e+05 4.000000 5.000000
50% 44.000000 0.545000 0.700000 5.952625 -7.016000 1.000000 0.047400 0.08140 0.003910 0.129000 0.480000 120.065000 2.200000e+05 4.000000 8.000000
75% 56.000000 0.659000 0.860000 8.000000 -5.189000 1.000000 0.083000 0.43400 0.056600 0.258000 0.672000 141.969000 2.630890e+05 4.000000 10.000000
max 100.000000 0.989000 1.000000 11.000000 1.355000 1.000000 0.955000 0.99600 0.996000 1.000000 0.986000 217.416000 1.793160e+06 5.000000 10.000000

Transformation

Code
# Normal QQ (probability) plot of each numeric column: points bending
# away from the reference line indicate departure from normality.
df_dist = df.select_dtypes([int, float])
fig = plt.figure(figsize=(15, 18))
try:
    for i, column in enumerate(df_dist.columns, 1):
        ax = plt.subplot(5, 3, i)
        stats.probplot(df_dist[column], dist='norm', plot=plt)
        ax.set_xlabel(None)
        ax.set_title(f'{column}')
        plt.tight_layout(w_pad=3)
except ValueError:
    # BUG FIX: was `exit;` — a bare name reference that does nothing.
    # The intent is to ignore running out of subplot slots; do it openly.
    pass
plt.show()

Code
# Print the sample skewness of every numeric feature, one per line.
df_skew = df.select_dtypes([int, float])

for col in df_skew.columns:
    print(col, '=', df_skew[col].skew())
Popularity = 0.07669558518544822
danceability = -0.08369345304304482
energy = -0.6610150352462746
key = -0.05850647363717779
loudness = -1.7614193691127105
mode = -0.5684696527675345
speechiness = 3.0880448562768334
acousticness = 1.105517996081122
instrumentalness = 1.9725302906570672
liveness = 2.177417778948198
valence = 0.0900847395181493
tempo = 0.37973839001127835
duration_in_ms = 4.028792955829684
time_signature = -4.18230862524085
Class = -0.6664557863042111

Features with high skewness that are not normally distributed (data points deviating from the line):

  • loudness
  • speechiness
  • acousticness
  • instrumentalness
  • liveness
  • duration_in_ms

Note: A skewness value greater than 1 or less than -1 indicates a highly skewed distribution.
Code
# Variance-stabilising transforms for the six highly skewed features:
# cube root is defined for loudness's negative dB values; sqrt tames
# moderate right skew; log handles the heaviest right skew.
# NOTE(review): np.log assumes strictly positive inputs — it returns
# -inf for zeros. The describe() output above shows positive minima for
# these columns after imputation, but verify if the preprocessing order
# ever changes.
df["loudness"] = np.cbrt(df["loudness"]) 
df["speechiness"] = np.sqrt(df["speechiness"])
df["acousticness"] = np.sqrt(df["acousticness"])
df["instrumentalness"] = np.log(df["instrumentalness"]) 
df["liveness"] = np.log(df["liveness"]) 
df["duration_in_ms"] = np.log(df["duration_in_ms"])

Distribution of features after transformation

Code
# Overlay each feature's KDE before (left y-axis) and after (red, right
# y-axis) transformation. `df_dist` still holds the numeric snapshot
# taken before the transforms ran; `df_dist_t` is the transformed data.
df_dist_t = df.select_dtypes([int, float])
fig = plt.figure(figsize=(15, 18))
try:
    for i, column in enumerate(df_dist.columns, 1):
        ax1 = plt.subplot(5, 3, i)
        ax1 = sns.kdeplot(x=df_dist[column])
        ax2 = ax1.twinx()
        ax2 = sns.kdeplot(x=df_dist_t[column], color='red')
        # BUG FIX: the label/title calls used `ax`, a stale axes object
        # left over from a previous cell (only ax1/ax2 exist in this
        # loop); style the current subplot's axes instead.
        ax1.set_xlabel(None)
        ax1.set_title(f'{column}')
        plt.tight_layout(w_pad=3)
except ValueError:
    # Was `exit;` — a no-op name reference; ignore grid overflow openly.
    pass
plt.show()

QQ plot after transformation

Code
# QQ plots again, after the transformations, to check how much closer
# each feature now is to a normal distribution.
df_dist = df.select_dtypes([int, float])
fig = plt.figure(figsize=(15, 18))
try:
    for i, column in enumerate(df_dist.columns, 1):
        ax = plt.subplot(5, 3, i)
        stats.probplot(df_dist[column], dist='norm', plot=plt)
        ax.set_xlabel(None)
        ax.set_title(f'{column}')
        plt.tight_layout(w_pad=3)
except ValueError:
    # BUG FIX: was `exit;`, a bare name reference that does nothing.
    pass
plt.show()

Code
df.head()
Artist Name Track Name Popularity danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo duration_in_ms time_signature Class
0 Bruno Mars That's What I Like (feat. Gucci Mane) 60.0 0.854 0.564 1.0 -1.705862 1 0.220227 0.130767 -5.544218 -2.466281 0.8990 134.071 12.365620 4 5
1 Boston Hitch a Ride 54.0 0.382 0.814 3.0 -1.933657 1 0.201494 0.033166 -5.518964 -2.292635 0.5690 116.454 12.436124 4 10
2 The Raincoats No Side to Fall In 35.0 0.434 0.614 6.0 -2.027455 1 0.229129 0.697137 -8.537396 -0.931404 0.7870 147.681 11.605204 4 6
3 Deno Lingo (feat. J.I & Chunkz) 66.0 0.853 0.597 10.0 -1.868931 0 0.235584 0.145602 -5.544218 -2.103734 0.5690 107.033 12.066627 4 5
4 Red Hot Chili Peppers Nobody Weird Like Me - Remastered 53.0 0.167 0.975 2.0 -1.623482 1 0.464758 0.013000 -4.128936 -1.760261 0.0918 199.060 12.345661 4 10
Code
# Box plots of every numeric feature, mainly to eyeball outliers.
df_cont = df.select_dtypes([int,float]) 

fig = plt.figure(figsize=(10, 10)) 
for i,columns in enumerate(df_cont.columns, 1): 
   ax = plt.subplot(5,3,i) 
   sns.boxplot(data = df_cont, x=df_cont[columns]) 
   ax.set_xlabel(None) 
   ax.set_title(f'Distribution of {columns}') 
   plt.tight_layout(w_pad=3) 
plt.show() 

Code
# Correlation heatmap across the DataFrame's columns.
# NOTE(review): this relies on corr() silently dropping the two string
# columns (old pandas behaviour); newer pandas requires
# numeric_only=True — verify against the installed pandas version.
plt.figure(figsize=(16, 7))
sns.heatmap(df.corr(), annot = True)
<AxesSubplot:>

It seems that there is no strong correlations between the target variable and the features

EDA Conclusion

  • There are 6 features not normally distributed and need transformation
  • There are some outliers that we might need to handle
  • There is no strong correlation between the target and features

Categorical data encoding

Code
# Encode the two high-cardinality string columns as integer labels.
# NOTE(review): the encoder is re-fit per column and only on the
# training frame; the test set is never transformed with the same
# mapping — confirm before predicting on `test`.
columns = ["Artist Name","Track Name"]
enc = LabelEncoder()
for col in columns:
  df[col] = enc.fit_transform(df[col])

feature engineering

Code
# from sklearn.preprocessing import StandardScaler

# # Center and scale (i.e., standardize) all numeric features excepte Class column 
# Y=df['Class']
# df=df.drop(['Class'] , axis=1)

# scaler = StandardScaler()
# scaler.fit(df)

# df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)
 
 
# df['Class']=Y
# df

Building the model using autoML

Data splitting

Code
# Separate the features from the target label.
x_train = df.drop('Class' ,axis = 1)
y_train = df['Class']
Code
# NOTE(review): the test set is imputed with 0 while the training set
# used mean/median imputation — an inconsistency worth revisiting.
test_data = test.fillna(0)
Code
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data for validation (fixed seed for
# reproducibility). Note: y_train is rebound here, shadowing the
# full-target series defined above.
X_train, X_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.20, random_state = 11)
Code
X_train.shape, y_train.shape
((14394, 16), (14394,))
Code
X_val.shape, y_val.shape
((3599, 16), (3599,))
Code
#!pip install auto-sklearn
Code
#!pip install pipelineprofiler

AutoML

Auto-sklearn is built around the scikit-learn machine learning library. It is a tool to fully automate the process of machine learning: it searches for the right learning algorithm for our dataset and optimizes the hyperparameters.

Code
inspect.signature(autosklearn.classification.AutoSklearnClassifier)
<Signature (time_left_for_this_task=3600, per_run_time_limit=None, initial_configurations_via_metalearning=25, ensemble_size: int = 50, ensemble_nbest=50, max_models_on_disc=50, seed=1, memory_limit=3072, include: Union[Dict[str, List[str]], NoneType] = None, exclude: Union[Dict[str, List[str]], NoneType] = None, resampling_strategy='holdout', resampling_strategy_arguments=None, tmp_folder=None, delete_tmp_folder_after_terminate=True, n_jobs: Union[int, NoneType] = None, dask_client: Union[distributed.client.Client, NoneType] = None, disable_evaluator_output=False, get_smac_object_callback=None, smac_scenario_args=None, logging_config=None, metadata_directory=None, metric=None, scoring_functions: Union[List[autosklearn.metrics.Scorer], NoneType] = None, load_models: bool = True, get_trials_callback=None)>
Code
# List every classification component auto-sklearn can consider.
for name in ClassifierChoice.get_components():
    print(name)
adaboost
bernoulli_nb
decision_tree
extra_trees
gaussian_nb
gradient_boosting
k_nearest_neighbors
lda
liblinear_svc
libsvm_svc
mlp
multinomial_nb
passive_aggressive
qda
random_forest
sgd

Build and fit a classifier

We have used two parameters which are:

1- time_left_for_this_task
Time limit in seconds for the search of appropriate models. By increasing this value, auto-sklearn has a higher chance of finding better models and we have set it to 600

2- per_run_time_limit
Time limit for a single call to the machine learning model. Model fitting will be terminated if the machine learning algorithm runs over the time limit and we have decided to set it to 70

Code
# Configure the AutoML search: 600 s total budget for the whole search,
# 70 s budget for any single model fit.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=600, per_run_time_limit=70)
Code
automl.fit(X_train, y_train)
/opt/conda/lib/python3.9/site-packages/autosklearn/metalearning/metalearning/meta_base.py:68: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  self.metafeatures = self.metafeatures.append(metafeatures)
/opt/conda/lib/python3.9/site-packages/autosklearn/metalearning/metalearning/meta_base.py:72: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  self.algorithm_runs[metric].append(runs)
AutoSklearnClassifier(per_run_time_limit=70, time_left_for_this_task=600)

the final ensemble constructed by auto-sklearn

To get the best performance out of the evaluated models, auto-sklearn uses ensemble selection to build an ensemble models which is a machine learning approach to combine multiple other models in the prediction process.

show_models() return a dictionary containing dictionaries of ensemble models, and we can see here that we have ten models, AdaBoostClassifier, MLPClassifier, HistGradientBoostingClassifier with five different set of hyperparameters, PassiveAggressiveClassifier and BernoulliNB.

A model dictionary contains the following:

1- “model_id”: The id given to a model by autosklearn.

2- “rank”: The rank of the model, based on its “cost”.

3- “cost”: The loss of the model on the validation set.

4- “ensemble_weight”: The weight given to the model in the ensemble.

5- “voting_model”: The cv_voting_ensemble model (for ‘cv’ resampling).

6- “estimators”: List of models (dicts) in cv_voting_ensemble (for ‘cv’ resampling).

7- “data_preprocessor”: The preprocessor used on the data.

8- “balancing”: The balancing used on the data (for classification).

9- “feature_preprocessor”: The preprocessor for features types.

10- “classifier” or “regressor”: The autosklearn wrapped classifier or regressor.

11- “sklearn_classifier” or “sklearn_regressor”: The sklearn classifier or regressor.

Code
pprint(automl.show_models(), indent=4)
{   7: {   'balancing': Balancing(random_state=1, strategy='weighting'),
           'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fcea8ace760>,
           'cost': 0.9254893706588087,
           'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fcea8d303d0>,
           'ensemble_weight': 0.02,
           'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fcea8ace580>,
           'model_id': 7,
           'rank': 10,
           'sklearn_classifier': HistGradientBoostingClassifier(early_stopping=False,
                               l2_regularization=0.0014846041861678746,
                               learning_rate=0.13443662321690814, max_iter=512,
                               max_leaf_nodes=47, min_samples_leaf=8,
                               n_iter_no_change=0, random_state=1,
                               validation_fraction=None, warm_start=True)},
    10: {   'balancing': Balancing(random_state=1, strategy='weighting'),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fcea8e9b2b0>,
            'cost': 0.9063355083140392,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fcea8d41280>,
            'ensemble_weight': 0.38,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fcea8e9b1f0>,
            'model_id': 10,
            'rank': 9,
            'sklearn_classifier': AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.0285269469336831, n_estimators=105,
                   random_state=1)},
    11: {   'balancing': Balancing(random_state=1),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fcea990ddf0>,
            'cost': 0.544516943801305,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fcea9923130>,
            'ensemble_weight': 0.04,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fcea990d430>,
            'model_id': 11,
            'rank': 3,
            'sklearn_classifier': MLPClassifier(activation='tanh', alpha=3.198483470889531e-06, beta_1=0.999,
              beta_2=0.9, hidden_layer_sizes=(24, 24),
              learning_rate_init=0.006604847357173181, max_iter=128,
              n_iter_no_change=32, random_state=1, validation_fraction=0.0,
              verbose=0, warm_start=True)},
    19: {   'balancing': Balancing(random_state=1),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fcea962cd90>,
            'cost': 0.5777731003999158,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fcea9917bb0>,
            'ensemble_weight': 0.02,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fcea962cc70>,
            'model_id': 19,
            'rank': 5,
            'sklearn_classifier': HistGradientBoostingClassifier(early_stopping=True,
                               l2_regularization=1.7049772538367706e-08,
                               learning_rate=0.0825755415435688, max_iter=64,
                               max_leaf_nodes=51, min_samples_leaf=121,
                               random_state=1,
                               validation_fraction=0.15078023719798528,
                               warm_start=True)},
    20: {   'balancing': Balancing(random_state=1),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fcea9913820>,
            'cost': 0.49610608292990954,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fcea99269a0>,
            'ensemble_weight': 0.08,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fcea9913f10>,
            'model_id': 20,
            'rank': 1,
            'sklearn_classifier': HistGradientBoostingClassifier(early_stopping=True,
                               l2_regularization=9.674948183980905e-09,
                               learning_rate=0.014247987845444413, max_iter=256,
                               max_leaf_nodes=55, min_samples_leaf=164,
                               n_iter_no_change=1, random_state=1,
                               validation_fraction=0.11770489601182355,
                               warm_start=True)},
    21: {   'balancing': Balancing(random_state=1),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fcea99048b0>,
            'cost': 0.5735634603241423,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fcea9924c40>,
            'ensemble_weight': 0.08,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fcea99041c0>,
            'model_id': 21,
            'rank': 4,
            'sklearn_classifier': PassiveAggressiveClassifier(C=0.0007163174331946707, max_iter=128,
                            random_state=1, tol=1.0000041320668022e-05,
                            warm_start=True)},
    25: {   'balancing': Balancing(random_state=1, strategy='weighting'),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fcea8d41dc0>,
            'cost': 0.7272153230898758,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fcea962c0a0>,
            'ensemble_weight': 0.1,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fcea8d41cd0>,
            'model_id': 25,
            'rank': 7,
            'sklearn_classifier': SGDClassifier(alpha=0.0018367485569817758, average=True,
              epsilon=0.06935467259367672, eta0=3.0158489869733344e-07,
              learning_rate='constant', loss='modified_huber', max_iter=128,
              penalty='l1', random_state=1, tol=5.2594585460405347e-05,
              warm_start=True)},
    28: {   'balancing': Balancing(random_state=1),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fcea9913e50>,
            'cost': 0.5320985055777732,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fcea96bbd00>,
            'ensemble_weight': 0.04,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fcea9913340>,
            'model_id': 28,
            'rank': 2,
            'sklearn_classifier': HistGradientBoostingClassifier(early_stopping=True,
                               l2_regularization=1.9961019888016454e-06,
                               learning_rate=0.3687611240638255, max_iter=16,
                               max_leaf_nodes=5, min_samples_leaf=1,
                               n_iter_no_change=7, random_state=1,
                               validation_fraction=0.11535434774390946,
                               warm_start=True)},
    29: {   'balancing': Balancing(random_state=1, strategy='weighting'),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fcea8c40040>,
            'cost': 0.8987581561776468,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fcea90851c0>,
            'ensemble_weight': 0.22,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fcea8d30f10>,
            'model_id': 29,
            'rank': 8,
            'sklearn_classifier': BernoulliNB(alpha=39.87397441278958, fit_prior=False)},
    30: {   'balancing': Balancing(random_state=1, strategy='weighting'),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7fcea9085cd0>,
            'cost': 0.6674384340138918,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7fcea99092e0>,
            'ensemble_weight': 0.02,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7fcea9085bb0>,
            'model_id': 30,
            'rank': 6,
            'sklearn_classifier': HistGradientBoostingClassifier(early_stopping=True,
                               l2_regularization=0.019135985125184364,
                               learning_rate=0.020108567827295615, max_iter=256,
                               max_leaf_nodes=47, min_samples_leaf=80,
                               n_iter_no_change=14, random_state=1,
                               validation_fraction=0.39266833442067606,
                               warm_start=True)}}
Code
profiler_data = PipelineProfiler.import_autosklearn(automl)
PipelineProfiler.plot_pipeline_matrix(profiler_data)

Results

Code
print(automl.leaderboard())
          rank  ensemble_weight                type      cost   duration
model_id                                                                
20           1             0.08   gradient_boosting  0.496106  26.036272
28           2             0.04   gradient_boosting  0.532099   1.114002
11           3             0.04                 mlp  0.544517   7.933256
21           4             0.08  passive_aggressive  0.573563   3.855479
19           5             0.02   gradient_boosting  0.577773   7.072537
30           6             0.02   gradient_boosting  0.667438   9.900146
25           7             0.10                 sgd  0.727215  38.394378
29           8             0.22        bernoulli_nb  0.898758   1.411935
10           9             0.38            adaboost  0.906336  12.885744
7           10             0.02   gradient_boosting  0.925489  62.687397
Code
# Score the final ensemble on the held-out validation split, then print
# auto-sklearn's summary statistics for the search run.
y_hat = automl.predict(X_val)
print("Accuracy score", sklearn.metrics.accuracy_score(y_val, y_hat))
print(automl.sprint_statistics())
Accuracy score 0.513198110586274
auto-sklearn results:
  Dataset name: 51304a8c-184b-11ed-836a-bb3acf05faab
  Metric: accuracy
  Best validation score: 0.503894
  Number of target algorithm runs: 29
  Number of successful target algorithm runs: 12
  Number of crashed target algorithm runs: 7
  Number of target algorithms that exceeded the time limit: 3
  Number of target algorithms that exceeded the memory limit: 7
Code
cv_results = pd.DataFrame.from_dict(automl.cv_results_)
cv_results
mean_test_score mean_fit_time params rank_test_scores status budgets param_balancing:strategy param_classifier:__choice__ param_data_preprocessor:__choice__ param_feature_preprocessor:__choice__ ... param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_max param_data_preprocessor:feature_type:numerical_transformer:rescaling:robust_scaler:q_min param_feature_preprocessor:fast_ica:n_components param_feature_preprocessor:kernel_pca:coef0 param_feature_preprocessor:kernel_pca:degree param_feature_preprocessor:kernel_pca:gamma param_feature_preprocessor:nystroem_sampler:coef0 param_feature_preprocessor:nystroem_sampler:degree param_feature_preprocessor:nystroem_sampler:gamma param_feature_preprocessor:select_rates_classification:mode
0 0.000000 3.074913 {'balancing:strategy': 'none', 'classifier:__c... 13 Memout 0.0 none random_forest feature_type no_preprocessing ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 0.000000 10.207830 {'balancing:strategy': 'weighting', 'classifie... 13 Crash 0.0 weighting libsvm_svc feature_type select_rates_classification ... 0.759067 0.238219 NaN NaN NaN NaN NaN NaN NaN fdr
2 0.481372 33.871870 {'balancing:strategy': 'weighting', 'classifie... 2 Success 0.0 weighting gradient_boosting feature_type feature_agglomeration ... 0.766357 0.255619 NaN NaN NaN NaN NaN NaN NaN NaN
3 0.000000 1.135846 {'balancing:strategy': 'none', 'classifier:__c... 13 Crash 0.0 none lda feature_type kitchen_sinks ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 0.000000 69.996468 {'balancing:strategy': 'weighting', 'classifie... 13 Timeout 0.0 weighting gradient_boosting feature_type no_preprocessing ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 0.074511 62.687397 {'balancing:strategy': 'weighting', 'classifie... 12 Success 0.0 weighting gradient_boosting feature_type select_rates_classification ... NaN NaN NaN NaN NaN NaN NaN NaN NaN fpr
6 0.000000 9.781259 {'balancing:strategy': 'weighting', 'classifie... 13 Crash 0.0 weighting libsvm_svc feature_type no_preprocessing ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
7 0.000000 2.469418 {'balancing:strategy': 'none', 'classifier:__c... 13 Crash 0.0 none gradient_boosting feature_type polynomial ... 0.967364 0.204455 NaN NaN NaN NaN NaN NaN NaN NaN
8 0.093664 12.885744 {'balancing:strategy': 'weighting', 'classifie... 11 Success 0.0 weighting adaboost feature_type fast_ica ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
9 0.455483 7.933256 {'balancing:strategy': 'none', 'classifier:__c... 4 Success 0.0 none mlp feature_type liblinear_svc_preprocessor ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
10 0.000000 70.004292 {'balancing:strategy': 'weighting', 'classifie... 13 Timeout 0.0 weighting gradient_boosting feature_type polynomial ... 0.965272 0.041903 NaN NaN NaN NaN NaN NaN NaN NaN
11 0.000000 65.689515 {'balancing:strategy': 'weighting', 'classifie... 13 Memout 0.0 weighting gradient_boosting feature_type polynomial ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
12 0.000000 4.542973 {'balancing:strategy': 'weighting', 'classifie... 13 Memout 0.0 weighting random_forest feature_type feature_agglomeration ... 0.750000 0.250000 NaN NaN NaN NaN NaN NaN NaN NaN
13 0.000000 1.983258 {'balancing:strategy': 'weighting', 'classifie... 13 Crash 0.0 weighting lda feature_type kitchen_sinks ... 0.889845 0.170408 NaN NaN NaN NaN NaN NaN NaN NaN
14 0.000000 1.917242 {'balancing:strategy': 'none', 'classifier:__c... 13 Memout 0.0 none random_forest feature_type feature_agglomeration ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
15 0.000000 4.938949 {'balancing:strategy': 'none', 'classifier:__c... 13 Memout 0.0 none random_forest feature_type liblinear_svc_preprocessor ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
16 0.000000 1.268959 {'balancing:strategy': 'none', 'classifier:__c... 13 Crash 0.0 none gradient_boosting feature_type fast_ica ... NaN NaN 588.0 NaN NaN NaN NaN NaN NaN NaN
17 0.422227 7.072537 {'balancing:strategy': 'none', 'classifier:__c... 7 Success 0.0 none gradient_boosting feature_type select_percentile_classification ... 0.810188 0.236563 NaN NaN NaN NaN NaN NaN NaN NaN
18 0.503894 26.036272 {'balancing:strategy': 'none', 'classifier:__c... 1 Success 0.0 none gradient_boosting feature_type feature_agglomeration ... 0.883087 0.122059 NaN NaN NaN NaN NaN NaN NaN NaN
19 0.426437 3.855479 {'balancing:strategy': 'none', 'classifier:__c... 6 Success 0.0 none passive_aggressive feature_type polynomial ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
20 0.000000 10.038196 {'balancing:strategy': 'weighting', 'classifie... 13 Crash 0.0 weighting libsvm_svc feature_type no_preprocessing ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
21 0.000000 4.405212 {'balancing:strategy': 'weighting', 'classifie... 13 Memout 0.0 weighting random_forest feature_type select_rates_classification ... 0.735635 0.290211 NaN NaN NaN NaN NaN NaN NaN NaN
22 0.443696 13.309701 {'balancing:strategy': 'weighting', 'classifie... 5 Success 0.0 weighting gradient_boosting feature_type no_preprocessing ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
23 0.272785 38.394378 {'balancing:strategy': 'weighting', 'classifie... 9 Success 0.0 weighting sgd feature_type polynomial ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
24 0.000000 8.215454 {'balancing:strategy': 'none', 'classifier:__c... 13 Memout 0.0 none random_forest feature_type polynomial ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25 0.000000 70.053956 {'balancing:strategy': 'none', 'classifier:__c... 13 Timeout 0.0 none random_forest feature_type polynomial ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
26 0.467901 1.114002 {'balancing:strategy': 'none', 'classifier:__c... 3 Success 0.0 none gradient_boosting feature_type no_preprocessing ... 0.963468 0.004092 NaN NaN NaN NaN NaN NaN NaN NaN
27 0.101242 1.411935 {'balancing:strategy': 'weighting', 'classifie... 10 Success 0.0 weighting bernoulli_nb feature_type select_rates_classification ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
28 0.332562 9.900146 {'balancing:strategy': 'weighting', 'classifie... 8 Success 0.0 weighting gradient_boosting feature_type select_percentile_classification ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

29 rows × 174 columns

Traditional ML

Fitting using default parameters

Code
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
Code
# Multi-layer perceptron with default hyperparameters.
clf = MLPClassifier().fit(X_train, y_train)
print(clf.score(X_val, y_val))
0.4459572103362045
Code
# Support-vector classifier with default hyperparameters.
clf = SVC().fit(X_train, y_train)
print(clf.score(X_val, y_val))
0.2836899138649625
Code
# NOTE(review): this repeats the MLPClassifier cell above with identical code;
# MLP training is stochastic (no random_state set), so the printed score
# differs slightly between the two runs. Consider removing one of the cells.
clf = MLPClassifier()

clf.fit(X_train, y_train)
print(clf.score(X_val, y_val))
0.43706585162545153
Code
# Gaussian naive Bayes baseline with default settings.
clf = GaussianNB().fit(X_train, y_train)
print(clf.score(X_val, y_val))
0.4090025006946374
Code
# Random forest with default hyperparameters.
clf = RandomForestClassifier().fit(X_train, y_train)
print(clf.score(X_val, y_val))
0.43734370658516253
Code
# Gradient boosting — the highest validation score among the defaults tried here.
clf = GradientBoostingClassifier().fit(X_train, y_train)
print(clf.score(X_val, y_val))
0.4901361489302584
Code
# k-nearest neighbours with the default neighbour count.
clf = KNeighborsClassifier().fit(X_train, y_train)
print(clf.score(X_val, y_val))
0.21700472353431507

Looking for the best K value for KNN

Code
# Sweep k = 2..20 and record train/validation accuracy for each setting.
K = np.arange(2, 21)

training_score = []
validation_score = []

for n_neighbors in K:
    model = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    training_score.append(model.score(X_train, y_train))
    validation_score.append(model.score(X_val, y_val))
Code
# Tabulate the sweep results, then show the row(s) with the best validation score.
hyper_params_knn = pd.DataFrame(
    {
        'No of Neighbors': K,
        'Training Score': np.array(training_score),
        'Validation Score': np.array(validation_score),
    },
    columns=['No of Neighbors', 'Training Score', 'Validation Score'],
)
best_mask = hyper_params_knn['Validation Score'] == hyper_params_knn['Validation Score'].max()
hyper_params_knn[best_mask]
No of Neighbors Training Score Validation Score
18 20 0.360706 0.272854
Code
# Keep the best-scoring row(s) for refitting below.
best_val = hyper_params_knn['Validation Score'].max()
knn_best_params = hyper_params_knn[hyper_params_knn['Validation Score'] == best_val]
knn_best_params
No of Neighbors Training Score Validation Score
18 20 0.360706 0.272854
Code
# Refit KNN using the neighbour count that maximised validation accuracy.
best_k = knn_best_params['No of Neighbors'].values[0]
knn_model = KNeighborsClassifier(n_neighbors=best_k)
knn_model.fit(X_train, y_train)
knn_model.score(X_val, y_val)
0.27285357043623226

Models with Feature Engineering

Code
!pip3 install xgboost
Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
     |████████████████████████████████| 192.9 MB 86 kB/s  eta 0:00:014    |██████▊                         | 40.3 MB 6.1 MB/s eta 0:00:25     |██████████▏                     | 61.4 MB 7.8 MB/s eta 0:00:17     |██████████████▌                 | 87.7 MB 683 kB/s eta 0:02:34     |█████████████████████▋          | 130.0 MB 7.4 MB/s eta 0:00:09
Requirement already satisfied: scipy in /opt/conda/lib/python3.9/site-packages (from xgboost) (1.7.3)
Requirement already satisfied: numpy in /opt/conda/lib/python3.9/site-packages (from xgboost) (1.21.5)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.1
Code
# Inspect the first rows of the training frame.
df.head()
Artist Name Track Name Popularity danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo duration_in_ms time_signature Class
0 Bruno Mars That's What I Like (feat. Gucci Mane) 60.0 0.854 0.564 1.0 -1.705862 1 0.220227 0.130767 -5.544218 -2.466281 0.8990 134.071 12.365620 4 5
1 Boston Hitch a Ride 54.0 0.382 0.814 3.0 -1.933657 1 0.201494 0.033166 -5.518964 -2.292635 0.5690 116.454 12.436124 4 10
2 The Raincoats No Side to Fall In 35.0 0.434 0.614 6.0 -2.027455 1 0.229129 0.697137 -8.537396 -0.931404 0.7870 147.681 11.605204 4 6
3 Deno Lingo (feat. J.I & Chunkz) 66.0 0.853 0.597 10.0 -1.868931 0 0.235584 0.145602 -5.544218 -2.103734 0.5690 107.033 12.066627 4 5
4 Red Hot Chili Peppers Nobody Weird Like Me - Remastered 53.0 0.167 0.975 2.0 -1.623482 1 0.464758 0.013000 -4.128936 -1.760261 0.0918 199.060 12.345661 4 10
Code
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Length (in characters) of Artist Name and Track Name.
df['len_artist_name'] = df['Artist Name'].apply(len)
df['len_track_name'] = df['Track Name'].apply(len)
# Whitespace-delimited word counts for both text columns.
df['artist_word_count'] = df['Artist Name'].str.split().str.len()
df['track_word_count'] = df['Track Name'].str.split().str.len()
# Count digit characters. Series.str.count(pattern) is the direct form of
# findall(pattern).str.len() and is equivalent for a single-character class.
df['artist_digit_count'] = df['Artist Name'].str.count(r'[0-9]')
df['track_digit_count'] = df['Track Name'].str.count(r'[0-9]')

# Combined text field consumed by the vectorizer cell below.
# NOTE(review): all of these features are added to the training frame only;
# `test` is never given the same columns — confirm how the test set is scored.
df['artist_and_track'] = df['Artist Name'] + ' ' + df['Track Name']
Code
# Integer-encode the two high-cardinality text columns in place.
# NOTE(review): the encoder is fit on the training frame only; labels unseen
# at fit time (e.g. new artists in `test`) would raise on transform — verify
# how the test split is meant to be encoded.
columns = ["Artist Name","Track Name"]
enc = LabelEncoder()
for col in columns:
    df[col] = enc.fit_transform(df[col])
Code
# Bag-of-words counts over the combined "artist + track" text.
count_vect = CountVectorizer()
artists_and_track_counts = count_vect.fit_transform(df['artist_and_track'])

# use_idf=False -> plain term-frequency normalisation, no IDF weighting.
tf_transformer = TfidfTransformer(use_idf=False).fit(artists_and_track_counts)
# NOTE(review): this assigns a dense 2-D (n_rows x vocab_size) array to a
# single DataFrame column; pandas rejects that unless vocab_size == 1 —
# confirm the intended feature (perhaps a per-row aggregate was meant).
df['artist_and_track'] = tf_transformer.transform(artists_and_track_counts).toarray()

XGBoost

Code
import xgboost as xgb

# Separate the target column from the feature matrix.
y = df["Class"]
X = df.drop(columns="Class")
Code
from sklearn.model_selection import train_test_split

# Hold out 20% for validation; fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=11
)
Code
# XGBoost classifier with default hyperparameters.
clf = xgb.XGBClassifier()
Code
# Train on the 80% training split.
clf.fit(X_train,y_train)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)
Code
# Accuracy on the held-out validation split.
clf.score(X_val, y_val)
0.5031953320366769

AutoML

Code
from pprint import pprint
import sklearn
import autosklearn
import autosklearn.classification
from autosklearn.pipeline.components.classification import ClassifierChoice
Code
# Cap the whole AutoML search at 1200 s (20 minutes).
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=1200)
Code
# Run the auto-sklearn search on the training split (prints pandas
# FutureWarnings from auto-sklearn's internals; harmless here).
automl.fit(X_train, y_train)
/opt/conda/lib/python3.9/site-packages/autosklearn/metalearning/metalearning/meta_base.py:68: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  self.metafeatures = self.metafeatures.append(metafeatures)
/opt/conda/lib/python3.9/site-packages/autosklearn/metalearning/metalearning/meta_base.py:72: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  self.algorithm_runs[metric].append(runs)
AutoSklearnClassifier(per_run_time_limit=120, time_left_for_this_task=1200)
Code
# Dump the models kept in the final ensemble, with their costs and weights.
pprint(automl.show_models(), indent=4)
{   11: {   'balancing': Balancing(random_state=1, strategy='weighting'),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7feeb5da71f0>,
            'cost': 0.92527888865502,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7feeb84e09d0>,
            'ensemble_weight': 0.02,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7feeb5d9bc10>,
            'model_id': 11,
            'rank': 8,
            'sklearn_classifier': HistGradientBoostingClassifier(early_stopping=False,
                               l2_regularization=0.0014846041861678746,
                               learning_rate=0.13443662321690814, max_iter=512,
                               max_leaf_nodes=47, min_samples_leaf=8,
                               n_iter_no_change=0, random_state=1,
                               validation_fraction=None, warm_start=True)},
    14: {   'balancing': Balancing(random_state=1),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7feeb84617c0>,
            'cost': 0.47295306251315516,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7feeb99422b0>,
            'ensemble_weight': 0.12,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7feeb84618b0>,
            'model_id': 14,
            'rank': 1,
            'sklearn_classifier': HistGradientBoostingClassifier(early_stopping=True,
                               l2_regularization=9.674948183980905e-09,
                               learning_rate=0.014247987845444413, max_iter=256,
                               max_leaf_nodes=55, min_samples_leaf=164,
                               n_iter_no_change=1, random_state=1,
                               validation_fraction=0.11770489601182355,
                               warm_start=True)},
    18: {   'balancing': Balancing(random_state=1, strategy='weighting'),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7feeb5dd81f0>,
            'cost': 0.91096611239739,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7feeb8a4e310>,
            'ensemble_weight': 0.46,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7feeb5dcdf70>,
            'model_id': 18,
            'rank': 7,
            'sklearn_classifier': AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.0285269469336831, n_estimators=105,
                   random_state=1)},
    23: {   'balancing': Balancing(random_state=1),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7feeb8a61670>,
            'cost': 0.566617554199116,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7feeb89c1a60>,
            'ensemble_weight': 0.06,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7feeb8a610d0>,
            'model_id': 23,
            'rank': 5,
            'sklearn_classifier': PassiveAggressiveClassifier(C=0.0007163174331946707, max_iter=128,
                            random_state=1, tol=1.0000041320668022e-05,
                            warm_start=True)},
    27: {   'balancing': Balancing(random_state=1, strategy='weighting'),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7feeb89911f0>,
            'cost': 0.5083140391496528,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7feeb89bffd0>,
            'ensemble_weight': 0.06,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7feeb8976d30>,
            'model_id': 27,
            'rank': 4,
            'sklearn_classifier': MLPClassifier(activation='tanh', alpha=0.0288316953521873, beta_1=0.999,
              beta_2=0.9, hidden_layer_sizes=(91,),
              learning_rate_init=0.0002309951929622804, max_iter=256,
              n_iter_no_change=32, random_state=1, validation_fraction=0.0,
              verbose=0, warm_start=True)},
    29: {   'balancing': Balancing(random_state=1),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7feeb89e9ca0>,
            'cost': 0.49610608292990954,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7feeb841fdc0>,
            'ensemble_weight': 0.02,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7feeb89e9a60>,
            'model_id': 29,
            'rank': 3,
            'sklearn_classifier': RandomForestClassifier(criterion='entropy', max_features=4, min_samples_leaf=7,
                       min_samples_split=20, n_estimators=512, n_jobs=1,
                       random_state=1, warm_start=True)},
    31: {   'balancing': Balancing(random_state=1),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7feeb8a4c880>,
            'cost': 0.47695222058514,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7feeb9a74e20>,
            'ensemble_weight': 0.02,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7feeb8a4cc40>,
            'model_id': 31,
            'rank': 2,
            'sklearn_classifier': HistGradientBoostingClassifier(early_stopping=False,
                               l2_regularization=1.0044517952196657e-10,
                               learning_rate=0.011804379532130016, max_iter=512,
                               max_leaf_nodes=55, min_samples_leaf=164,
                               n_iter_no_change=0, random_state=1,
                               validation_fraction=None, warm_start=True)},
    33: {   'balancing': Balancing(random_state=1, strategy='weighting'),
            'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7feeb8503d30>,
            'cost': 0.7272153230898758,
            'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7feeb8986e50>,
            'ensemble_weight': 0.24,
            'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7feeb8503790>,
            'model_id': 33,
            'rank': 6,
            'sklearn_classifier': QuadraticDiscriminantAnalysis(reg_param=0.7998081178586495)}}

We can see here that the final ensemble contains eight models: HistGradientBoostingClassifier with three different sets of hyperparameters, plus PassiveAggressiveClassifier, AdaBoostClassifier, RandomForestClassifier, QuadraticDiscriminantAnalysis, and MLPClassifier.

Code
# Interactive visual comparison of the searched pipelines.
profiler_data = PipelineProfiler.import_autosklearn(automl)
PipelineProfiler.plot_pipeline_matrix(profiler_data)
Code
# Ranked summary of ensemble members (cost = validation loss, lower is better).
print(automl.leaderboard())
          rank  ensemble_weight                type      cost   duration
model_id                                                                
14           1             0.12   gradient_boosting  0.472953  20.949500
31           2             0.02   gradient_boosting  0.476952  44.664088
29           3             0.02       random_forest  0.496106  18.046945
27           4             0.06                 mlp  0.508314  27.220666
23           5             0.06  passive_aggressive  0.566618   7.123063
33           6             0.24                 qda  0.727215   1.017169
18           7             0.46            adaboost  0.910966  11.412913
11           8             0.02   gradient_boosting  0.925279  42.281444
Code
from sklearn.metrics import f1_score, precision_score, recall_score

# Micro-averaged F1 of the AutoML ensemble on the validation split.
y_hat = automl.predict(X_val)
score = f1_score(y_true=y_val, y_pred=y_hat, average="micro")
print('F-Measure: ', score)
F-Measure:  0.5254237288135594

Results and Conclusion

Model Selection

We selected an ensemble model that was built using AutoSklearn with the following components: HistGradientBoostingClassifier with three different sets of hyperparameters, PassiveAggressiveClassifier, AdaBoostClassifier, RandomForestClassifier, QuadraticDiscriminantAnalysis, and MLPClassifier.

It achieves an F1-score of 52.5%.

Conclusions